imports¶

In [118]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score , recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from scipy import stats

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
In [119]:
df=pd.read_csv(r"C:\Users\mmnm2\Desktop\PROJECT 45 END\water.csv")
In [120]:
df
Out[120]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 NaN 392.449580 19.903225 NaN 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 NaN 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 NaN 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 NaN 327.459760 16.140368 78.698446 2.309149 1

3276 rows × 10 columns

In [121]:
df.head ()
Out[121]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
In [122]:
df.tail()
Out[122]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 NaN 392.449580 19.903225 NaN 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 NaN 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 NaN 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 NaN 327.459760 16.140368 78.698446 2.309149 1

Exploratory data analysis¶

In [123]:
df.size                 
Out[123]:
32760
In [124]:
df.shape
Out[124]:
(3276, 10)
In [125]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB
In [126]:
df.columns
Out[126]:
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')
In [127]:
df.describe()
Out[127]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
count 2785.000000 3276.000000 3276.000000 3276.000000 2495.000000 3276.000000 3276.000000 3114.000000 3276.000000 3276.000000
mean 7.080795 196.369496 22014.092526 7.122277 333.775777 426.205111 14.284970 66.396293 3.966786 0.390110
std 1.594320 32.879761 8768.570828 1.583085 41.416840 80.824064 3.308162 16.175008 0.780382 0.487849
min 0.000000 47.432000 320.942611 0.352000 129.000000 181.483754 2.200000 0.738000 1.450000 0.000000
25% 6.093092 176.850538 15666.690297 6.127421 307.699498 365.734414 12.065801 55.844536 3.439711 0.000000
50% 7.036752 196.967627 20927.833607 7.130299 333.073546 421.884968 14.218338 66.622485 3.955028 0.000000
75% 8.062066 216.667456 27332.762127 8.114887 359.950170 481.792304 16.557652 77.337473 4.500320 1.000000
max 14.000000 323.124000 61227.196008 13.127000 481.030642 753.342620 28.300000 124.000000 6.739000 1.000000
In [128]:
df.Potability.value_counts()
Out[128]:
Potability
0    1998
1    1278
Name: count, dtype: int64
In [129]:
sns.countplot(x='Potability', data=df)
plt.title('Count of Potability')
plt.show()
No description has been provided for this image
In [130]:
df.corr()
Out[130]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
ph 1.000000 0.082096 -0.089288 -0.034350 0.018203 0.018614 0.043503 0.003354 -0.039057 -0.003556
Hardness 0.082096 1.000000 -0.046899 -0.030054 -0.106923 -0.023915 0.003610 -0.013013 -0.014449 -0.013837
Solids -0.089288 -0.046899 1.000000 -0.070148 -0.171804 0.013831 0.010242 -0.009143 0.019546 0.033743
Chloramines -0.034350 -0.030054 -0.070148 1.000000 0.027244 -0.020486 -0.012653 0.017084 0.002363 0.023779
Sulfate 0.018203 -0.106923 -0.171804 0.027244 1.000000 -0.016121 0.030831 -0.030274 -0.011187 -0.023577
Conductivity 0.018614 -0.023915 0.013831 -0.020486 -0.016121 1.000000 0.020966 0.001285 0.005798 -0.008128
Organic_carbon 0.043503 0.003610 0.010242 -0.012653 0.030831 0.020966 1.000000 -0.013274 -0.027308 -0.030001
Trihalomethanes 0.003354 -0.013013 -0.009143 0.017084 -0.030274 0.001285 -0.013274 1.000000 -0.022145 0.007130
Turbidity -0.039057 -0.014449 0.019546 0.002363 -0.011187 0.005798 -0.027308 -0.022145 1.000000 0.001581
Potability -0.003556 -0.013837 0.033743 0.023779 -0.023577 -0.008128 -0.030001 0.007130 0.001581 1.000000
In [131]:
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
Out[131]:
<Axes: >
No description has been provided for this image
In [132]:
Potability_data = df[df['Potability'] == 1]
Potability_data.hist(figsize=(12, 10))
plt.show()
No description has been provided for this image
In [133]:
sns.set(style="whitegrid")
plt.figure(figsize=(15, 10))

Potability_data = df[df['Potability'] == 1]

for i, column in enumerate(Potability_data.columns[:-1]):
    plt.subplot(3, 3, i + 1)
    sns.histplot(Potability_data[column], bins=20, kde=True)
    plt.title(f'Distribution of {column} for Potability ')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [134]:
sns.pairplot(df, hue="Potability")
Out[134]:
<seaborn.axisgrid.PairGrid at 0x1d9eea6d890>
No description has been provided for this image

Q - What is the relationship between Chloramines and Conductivity ?¶

In [135]:
plt.scatter(df['Chloramines'], df['Conductivity'])
plt.xlabel('Chloramines')
plt.ylabel('Conductivity')
plt.title('Scatter plot: Chloramines vs Conductivity')
plt.show()
No description has been provided for this image

*Relation between the conductivity and the chloramines are concentrated between 4 - 10 for the chloramines and 300 - 600 for the conductivitu*

Q - How does the concentration of solids differ between potable and non-potable water samples?¶

In [136]:
# Bar Plot for Average Solids Concentration by Potability
plt.figure(figsize=(8, 6))
sns.barplot(data=df, x='Potability', y='Solids', ci=None)
plt.title('Average Solids Concentration by Potability')
plt.xlabel('Potability')
plt.ylabel('Average Solids Concentration')
plt.xticks(ticks=[0, 1], labels=['Non-Potable', 'Potable'])
plt.show()
No description has been provided for this image

*The bar plot shows that the average concentration of solids in non-potable water samples is less compared to potable water samples, indicating a potential relationship between higher solids concentration and water non-potability.*

Preproccessing the dataset (cleaning)¶

In [137]:
df.isnull().sum()
Out[137]:
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
In [138]:
columns_to_replace = ['ph', 'Sulfate', 'Trihalomethanes']
means = df[columns_to_replace].mean()
df[columns_to_replace] = df[columns_to_replace].replace(0, means)
In [139]:
df['ph'].fillna(df['ph'].mean(), inplace = True)
df['Trihalomethanes'].fillna(df['Trihalomethanes'].mean(), inplace = True)
df['Sulfate'].fillna(df['Sulfate'].mean(), inplace = True)
In [140]:
df.isnull().sum()
Out[140]:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64
In [141]:
df.describe()
Out[141]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
count 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000 3276.000000
mean 7.083337 196.369496 22014.092526 7.122277 333.775777 426.205111 14.284970 66.396293 3.966786 0.390110
std 1.464737 32.879761 8768.570828 1.583085 36.142612 80.824064 3.308162 15.769881 0.780382 0.487849
min 0.227499 47.432000 320.942611 0.352000 129.000000 181.483754 2.200000 0.738000 1.450000 0.000000
25% 6.279317 176.850538 15666.690297 6.127421 317.094638 365.734414 12.065801 56.647656 3.439711 0.000000
50% 7.083337 196.967627 20927.833607 7.130299 333.775777 421.884968 14.218338 66.396293 3.955028 0.000000
75% 7.870050 216.667456 27332.762127 8.114887 350.385756 481.792304 16.557652 76.666609 4.500320 1.000000
max 14.000000 323.124000 61227.196008 13.127000 481.030642 753.342620 28.300000 124.000000 6.739000 1.000000
In [142]:
df.duplicated().sum()
Out[142]:
0
In [143]:
# Handle the balanced in the dataset

X = df.drop('Potability', axis=1)
y = df['Potability']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
In [117]:
plt.figure(figsize=(8, 6))
sns.countplot(x=y_resampled)
plt.title('Distribution of Potability')
plt.xlabel('Potability')
plt.ylabel('Count')
plt.show()
No description has been provided for this image

Q- is the distribution the proportion and Potability and non-Potability?¶

In [28]:
potability_counts = df['Potability'].value_counts()
labels = ['Non-Potable', 'Potable']
plt.pie(potability_counts, labels=labels, autopct='%1.1f%%', startangle=140)
plt.title('Proportion of Potable vs Non-Potable Water Samples ')
plt.show()
No description has been provided for this image

*we compare the proportion and Potability in non-Potability, and we found that the concentration of the non potable water is more than the potable water*

In [29]:
df.boxplot(figsize=(14,7))
Out[29]:
<Axes: >
No description has been provided for this image

Extracting the features and splitting the dataset¶

In [30]:
x = df.drop('Potability',axis=1)
y = df['Potability']
In [80]:
# splitting the dataset to train 80% and test 20%
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=0)
In [81]:
# Normalize the feature data
from sklearn.preprocessing import MinMaxScaler

# Normalize the feature data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

LogisticRegression model¶

In [82]:
model1 = LogisticRegression ()
model1.fit(X_train_normalized, y_train)
Out[82]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [83]:
predict_LR = model1.predict(X_test_normalized)
In [84]:
accuracy_score(predict_LR, y_test)
Out[84]:
0.5175
In [85]:
report = classification_report(y_test, predict_LR)
print("LogisticRegression Classifier Report:")
print(report)
LogisticRegression Classifier Report:
              precision    recall  f1-score   support

           0       0.55      0.44      0.49       418
           1       0.50      0.60      0.54       382

    accuracy                           0.52       800
   macro avg       0.52      0.52      0.52       800
weighted avg       0.52      0.52      0.51       800

In [86]:
confusion_matrix(predict_LR, y_test)
sns.heatmap(confusion_matrix(predict_LR, y_test), annot=True)
Out[86]:
<Axes: >
No description has been provided for this image

SVM model¶

In [87]:
model2 = SVC ()
model2.fit(X_train_normalized, y_train)
Out[87]:
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
In [88]:
predict_svm = model2.predict(X_test_normalized)
In [89]:
accuracy = accuracy_score(y_test, predict_svm)
print("Accuracy:", accuracy)
Accuracy: 0.68125
In [90]:
report = classification_report(y_test, predict_svm)
print("SVC Classifier Report:")
print(report)
SVC Classifier Report:
              precision    recall  f1-score   support

           0       0.72      0.64      0.68       418
           1       0.65      0.73      0.68       382

    accuracy                           0.68       800
   macro avg       0.68      0.68      0.68       800
weighted avg       0.69      0.68      0.68       800

In [91]:
confusion_matrix(predict_svm, y_test)
sns.heatmap(confusion_matrix(predict_svm, y_test), annot=True)
Out[91]:
<Axes: >
No description has been provided for this image

DecisionTreeClassifier model¶

In [92]:
model3 = DecisionTreeClassifier ()
model3.fit(X_train_normalized, y_train)
Out[92]:
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
In [93]:
predict_DT = model3.predict(X_test_normalized)
In [94]:
accuracy_score(predict_DT, y_test)
Out[94]:
0.595
In [95]:
report = classification_report(y_test, predict_DT)
print("DecisionTreeClassifier Classifier Report:")
print(report)
DecisionTreeClassifier Classifier Report:
              precision    recall  f1-score   support

           0       0.63      0.53      0.58       418
           1       0.56      0.66      0.61       382

    accuracy                           0.59       800
   macro avg       0.60      0.60      0.59       800
weighted avg       0.60      0.59      0.59       800

In [96]:
confusion_matrix(predict_DT, y_test)
sns.heatmap(confusion_matrix(predict_DT, y_test), annot=True)
Out[96]:
<Axes: >
No description has been provided for this image

RandomForestClassifier model¶

In [97]:
model4 = RandomForestClassifier()
In [98]:
model4.fit(X_train_normalized, y_train)
Out[98]:
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [99]:
predict_RF = model4.predict(X_test_normalized)
In [100]:
accuracy_score(predict_RF, y_test)
Out[100]:
0.725
In [101]:
confusion_matrix(predict_RF, y_test)
sns.heatmap(confusion_matrix(predict_RF, y_test), annot=True)
Out[101]:
<Axes: >
No description has been provided for this image
In [102]:
report = classification_report(y_test, predict_RF)
print("RandomForestClassifier Classifier Report:")
print(report)
RandomForestClassifier Classifier Report:
              precision    recall  f1-score   support

           0       0.75      0.72      0.73       418
           1       0.70      0.73      0.72       382

    accuracy                           0.73       800
   macro avg       0.72      0.73      0.72       800
weighted avg       0.73      0.72      0.73       800

In [ ]:
 
In [ ]:
 
In [107]:
import pickle

# Save the trained model to a .pkl file
with open('RandomForestClassifier_model1.sav', 'wb') as file:
    pickle.dump(model4, file)
In [108]:
#filename = 'Water_quality_prediction.sav'
#pickle.dump(model4, open(filename,'wb'))
In [109]:
import streamlit as st
pickle.load(open('predict_LR = model1.predict(X_test_normalized)', 'rb'))
Out[109]:
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [ ]:
 
In [ ]:
 
In [ ]: